Predict sale prices and practice feature engineering, random forests (RFs), and gradient boosting

library(ggplot2)

# Root directory of the project; the CSV files are read from its data/ folder.
PROJ_PATH <- '~/Documents/kaggle/house_prices'

# Read text columns as factors explicitly instead of relying on the default.
# R 4.0.0 changed the default of `stringsAsFactors` to FALSE, which leaves the
# categorical columns as character vectors; the categorical-vs-price plots
# later in this file call plot() on those columns and need factors to work.
train <- read.csv(file.path(PROJ_PATH, 'data/train.csv'),
                  stringsAsFactors = TRUE)
test <- read.csv(file.path(PROJ_PATH, 'data/test.csv'),
                 stringsAsFactors = TRUE)

# Column names of the training set; plots below are titled from this vector.
nm <- colnames(train)

# Numeric variable indices (column positions in `train`), annotated with the
# names those positions have in the names(train) listing.
num_idx <- c(
  4, 5,                    # LotFrontage, LotArea
  18, 19, 20, 21,          # OverallQual, OverallCond, YearBuilt, YearRemodAdd
  27,                      # MasVnrArea
  35, 37, 38, 39,          # BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF
  44, 45, 46, 47,          # X1stFlrSF, X2ndFlrSF, LowQualFinSF, GrLivArea
  48, 49, 50, 51,          # BsmtFullBath, BsmtHalfBath, FullBath, HalfBath
  52, 53, 55,              # BedroomAbvGr, KitchenAbvGr, TotRmsAbvGrd
  57,                      # Fireplaces
  60, 62, 63,              # GarageYrBlt, GarageCars, GarageArea
  67, 68, 69, 70, 71, 72,  # WoodDeckSF, OpenPorchSF, EnclosedPorch,
                           # X3SsnPorch, ScreenPorch, PoolArea
  76, 77, 78               # MiscVal, MoSold, YrSold
)

# Categorical variable indices: every column between the first and the last
# that is not listed as numeric above.
cat_idx <- setdiff(seq.int(2L, ncol(train) - 1L), num_idx)

# Histogram with this document's house style.
#
# ...     Passed straight through to hist() (data, main, and so on).
# breaks  Break specification handed to hist(); defaults to 30.
# col     Fill colour; also used as the bar border colour.
# xlab    X-axis label; NULL by default.
#
# Returns whatever hist() returns.
hist2 <- function(..., breaks = 30, col = "darkgray", xlab = NULL) {
  hist(..., xlab = xlab, border = col, col = col, breaks = breaks)
}

# Horizontal bar plot with this document's house style.
#
# ...   Passed straight through to barplot() (heights, main, xlim, and so on).
# col   Fill colour; also used as the bar border colour.
# xlab  X-axis label. Forwarded to barplot(); previously this argument was
#       accepted but silently dropped. The NULL default matches barplot()'s
#       own default, so existing calls are unaffected.
#
# Bars are always drawn horizontally (horiz = TRUE).
# Returns whatever barplot() returns.
barplot2 <- function(..., col = 'darkgray', xlab = NULL) {
  barplot(..., col = col, border = col, xlab = xlab, horiz = TRUE)
}

# plot() with this document's house style.
#
# ...  Passed straight through to plot() (data, main, ylim, and so on).
# col  Drawing colour; defaults to 'gray30' at alpha 0.2.
# bty  Box type around the plot; defaults to 'n'.
#
# Returns whatever plot() returns.
plot2 <- function(..., col = adjustcolor("gray30", alpha.f = 0.2), bty = "n") {
  plot(..., bty = bty, col = col)
}

Data description

# Number of rows and columns in the training set
dim(train)
## [1] 1460   81
# Column names, printed with their positions (num_idx and cat_idx index into these)
names(train)
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "Electrical"    "X1stFlrSF"    
## [45] "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
## [49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
## [61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
## [65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [69] "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"   "PoolArea"     
## [73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
## [77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [81] "SalePrice"

Histograms of numeric variables

# Uniform three-line margins, four panels per row.
par(mar = rep(3, 4))
par(mfrow = c(1, 4))

# One histogram per numeric column, titled with the column name.
for (num_name in nm[num_idx]) {
  hist2(train[[num_name]], main = num_name)
}

# SalePrice is not in num_idx, so it gets its own histogram.
hist2(train$SalePrice, main = 'SalePrice')

Frequencies of categorical variables

# las = 2 draws axis labels perpendicular to the axis; the left margin is
# one line wider than the others. Four panels per row.
par(las = 2)
par(mar = c(3, 4, 3, 3))
par(mfrow = c(1, 4))

# One bar plot of value counts per categorical column, titled with its name.
for (cat_name in nm[cat_idx]) {
  barplot2(table(train[[cat_name]]), main = cat_name)
}

Missingness of variables (only those with any values missing)

# Fraction (0 to 1) of missing values in each column, in ascending order.
missing_perc <- sapply(train, function(column) {
  sum(is.na(column)) / length(column)
})
missing_perc <- sort(missing_perc)

# Wide left margin for the column names; plot only columns with missing values.
par(las = 2)
par(mar = c(3, 7, 2, 2))
barplot2(
  missing_perc[which(missing_perc > 0)],
  main = 'Missingness',
  xlim = c(0, 1),
  cex.names = 0.6,
  cex.axis = 0.6
)

Relation of each variable to price

# Uniform three-line margins, four panels per row.
par(mar = rep(3, 4))
par(mfrow = c(1, 4))

# Plot each numeric column against SalePrice. Every panel uses the full
# SalePrice range on the y-axis so panels are directly comparable.
for (num_name in nm[num_idx]) {
  plot2(
    train[c(num_name, 'SalePrice')],
    main = num_name,
    ylim = range(train$SalePrice)
  )
}

# Uniform three-line margins, four panels per row.
par(mar = rep(3, 4))
par(mfrow = c(1, 4))

# Plot each categorical column against SalePrice. Every panel uses the full
# SalePrice range on the y-axis so panels are directly comparable.
for (cat_name in nm[cat_idx]) {
  plot2(
    train[c(cat_name, 'SalePrice')],
    main = cat_name,
    ylim = range(train$SalePrice)
  )
}